Description

Exploratory data analysis for a Twitter sentiment dataset: target distribution, a tf-idf + PCA/k-means projection, word-frequency tables, word clouds, treemaps, engineered-feature comparisons, and n-gram analysis of positive vs negative tweets.

Load the libraries

In [1]:
import sys
sys.path.append('/Users/poudel/opt/miniconda3/envs/nlp/lib/python3.7/site-packages')
In [50]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import mlxtend
import plotly_express as px

pd.options.plotting.backend = "plotly"
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 1000)

import time,os,json,sys
time_start_notebook = time.time()
home = os.path.expanduser('~')
SEED=100

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

print([(x.__name__,x.__version__) for x in [np,pd,sns,sklearn,mlxtend,px]])

#=========Visualization
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=False)


#========= NLP
import re
import string
import nltk
import spacy
import textblob
import gensim
from urllib.parse import urlparse
from nltk.corpus import stopwords
import texthero as hero
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer


print([(x.__name__,x.__version__) for x in [nltk,spacy,textblob,gensim]])

#=======OTHERS
import ast
import scipy
import multiprocessing as mp
import gc
import operator
from collections import defaultdict

#=====Warnings
import warnings
warnings.simplefilter("ignore")  # also silences plotting warnings
[('numpy', '1.17.5'), ('pandas', '1.0.5'), ('seaborn', '0.10.1'), ('sklearn', '0.23.1'), ('mlxtend', '0.17.0'), ('plotly_express', '0.4.1')]
[('nltk', '3.4.4'), ('spacy', '2.2.3'), ('textblob', '0.15.3'), ('gensim', '3.8.3')]
In [3]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;

Load the data

In [4]:
import ast
In [5]:
df_combined = pd.read_csv('../data/processed/df_combined_clean.csv')

# Variables
target = 'label'
maincol = 'tweet'
mc = maincol + '_clean'
mcl = maincol + '_lst_clean'
mce = mc + '_emoji'
mcle = mcl + '_emoji'

# list columns were saved to CSV as strings; parse them back into Python lists
df_combined[mcl] = df_combined[mcl].apply(ast.literal_eval)
df_combined[mcle] = df_combined[mcle].apply(ast.literal_eval)

df_train = df_combined[~df_combined[target].isnull()]
df_test = df_combined[df_combined[target].isnull()]

print(f"shape df_train: {df_train.shape}")
print(f"shape df_test: {df_test.shape}")

df_train.head(2).append(df_train.tail(2))
shape df_train: (7920, 24)
shape df_test: (1953, 24)
Out[5]:
[Wide DataFrame preview: first and last two rows of df_train. 24 columns: index, id, label, the raw tweet, cleaned token lists and strings (plain and emoji-split variants), hashtag lists/strings, and the engineered numeric features (total_length, num_words, num_sent, num_unique_words, num_words_title, num_uppercase, num_exclamation_marks, num_question_marks, num_punctuation, num_symbols, num_digits, avg_word_len, avg_uppercase, avg_unique).]
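Why the `ast.literal_eval` step above is needed: list-valued columns round-trip through CSV as their string representation, so they have to be parsed back into Python lists. A minimal sketch with a hypothetical two-row series:

In [ ]:
import ast
import pandas as pd

# A list column written to CSV comes back as its repr string.
s = pd.Series([['iphone', 'aple'], ['samsung']]).astype(str)
restored = s.apply(ast.literal_eval)
print(type(s[0]), type(restored[0]))  # <class 'str'> <class 'list'>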
In [6]:
df = df_train
df_pos = df[df['label']==0.0] # note: 0 = positive in this dataset, NOT 1
df_neg = df[df['label']==1.0] # 1 = negative

Target Distribution

In [7]:
sns.countplot(df[target])
Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f8fe4062f10>
In [8]:
df[target].value_counts().plot.bar()
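Because the pandas plotting backend was set to "plotly" at the top of the notebook, `.plot.bar()` here returns an interactive plotly figure rather than a matplotlib axes.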

PCA plot

In [9]:
df1 = df.sample(1000)

df1['tfidf'] = df1[mce].pipe(hero.tfidf)  # df1, not df: fit tf-idf on the sample itself
df1['pca'] = df1['tfidf'].pipe(hero.pca)
df1['kmeans_labels'] = df1['tfidf'].pipe(hero.kmeans,n_clusters=2)

hero.scatterplot(df1, 'pca', color='kmeans_labels')
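For readers unfamiliar with texthero, the pipeline above is roughly equivalent to the following sklearn sketch (hero's tokenization and defaults may differ, so treat this as an approximation, not hero's exact implementation):

In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

X = TfidfVectorizer().fit_transform(df1[mce])            # sparse tf-idf matrix
coords = PCA(n_components=2).fit_transform(X.toarray())  # 2-D projection for plotting
clusters = KMeans(n_clusters=2, random_state=SEED).fit_predict(X)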

Frequency Distribution

In [10]:
df.head(2).T
Out[10]:
0 1
index 0 1
id 1 2
label 0 0
tweet #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/
tweet_lst_clean [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone] [finaly, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias]
tweet_clean fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone finaly transparant silicon case thanks uncle yay sony xperia sonyexperias
hashtags_lst ['#fingerprint', '#Pregnancy', '#android', '#apps', '#beautiful', '#cute', '#health', '#igers', '#iphoneonly', '#iphonesia', '#iphone'] ['#yay', '#Sony', '#Xperia', '#S', '#sonyexperias…']
hashtags #fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone #yay #Sony #Xperia #S #sonyexperias…
total_length 128 131
num_words 13 17
num_sent 1 1
num_unique_words 13 17
num_words_title 2 5
num_uppercase 5 12
num_exclamation_marks 0 0
num_question_marks 0 0
num_punctuation 2 3
num_symbols 0 0
num_digits 0 0
avg_word_len 8.92308 6.76471
avg_uppercase 0.0390625 0.0916031
avg_unique 1 1
tweet_lst_clean_emoji [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, iger, iphone, iphones, iphone] [finaly, trans, paran, silicon, case, thanks, uncle, yay, sony, x, peri, sony, ex, peri]
tweet_clean_emoji fingerprint pregnancy test android aps beautiful cute health iger iphone iphones iphone finaly trans paran silicon case thanks uncle yay sony x peri sony ex peri
In [11]:
df[mcle].head(2)[0]
Out[11]:
['fingerprint',
 'pregnancy',
 'test',
 'android',
 'aps',
 'beautiful',
 'cute',
 'health',
 'iger',
 'iphone',
 'iphones',
 'iphone']
In [12]:
arr_all_words = df[mcle].sum()
arr_pos_words = df[df[target]==0.0][mcle].sum()
arr_neg_words = df[df[target]==1.0][mcle].sum()

print(f"len arr_all_words: {len(arr_all_words)}")
print(f"len arr_pos_words: {len(arr_pos_words)}")
print(f"len arr_neg_words: {len(arr_neg_words)}")
len arr_all_words: 112290
len arr_pos_words: 89487
len arr_neg_words: 22803
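`Series.sum()` on a list column concatenates the lists pairwise, which becomes quadratic on large corpora. `itertools.chain` is a linear-time alternative that produces the same flat list (a sketch):

In [ ]:
from itertools import chain

arr_all_words = list(chain.from_iterable(df[mcle]))  # same result, linear time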
In [13]:
from collections import Counter

df_freq = pd.DataFrame(Counter(arr_all_words).most_common())
df_freq_pos = pd.DataFrame(Counter(arr_pos_words).most_common())
df_freq_neg = pd.DataFrame(Counter(arr_neg_words).most_common())

df_freq_pos.head()
Out[13]:
0 1
0 iphone 4153
1 aple 1737
2 samsung 1424
3 pic 1253
4 insta 1247
In [14]:
df_freq = pd.DataFrame(np.unique(arr_all_words,return_counts=True)).T
df_freq.head(2).append(df_freq.tail(2))
Out[14]:
0 1
0 aa 7
1 aaa 2
10603 zy 13
10604 zz 1
In [15]:
fdist = nltk.FreqDist(arr_all_words)

print([i for i in dir(fdist) if i[0]!='_'])
['B', 'N', 'Nr', 'clear', 'copy', 'elements', 'freq', 'fromkeys', 'get', 'hapaxes', 'items', 'keys', 'max', 'most_common', 'pformat', 'plot', 'pop', 'popitem', 'pprint', 'r_Nr', 'setdefault', 'subtract', 'tabulate', 'unicode_repr', 'update', 'values']
In [16]:
df_freq = pd.DataFrame(fdist.most_common(20))
df_freq.head()
Out[16]:
0 1
0 iphone 4831
1 aple 3183
2 samsung 1574
3 new 1381
4 pic 1305
In [17]:
fdist.plot(20)
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f8f33605f50>
In [18]:
df_freq = hero.top_words(df[mce]).to_frame()
df_freq.head(2)
Out[18]:
tweet_clean_emoji
iphone 4831
aple 3183
In [19]:
hero.top_words(df[mce],normalize=True).to_frame().head().mul(100)
Out[19]:
tweet_clean_emoji
iphone 4.302253
aple 2.834625
samsung 1.401728
new 1.229851
pic 1.162169

Word Cloud

In [20]:
hero.wordcloud(df[mce])
In [64]:
from wordcloud import WordCloud
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[30, 30])

wordcloud1 = WordCloud( background_color='white',
                        width=800,
                        height=600
                      ).generate(' '.join(arr_pos_words))


ax1.imshow(wordcloud1)
ax1.axis('off')
ax1.set_title('Positive Tweets',fontsize=40);

wordcloud2 = WordCloud( background_color='white',
                        width=800,
                        height=600
                      ).generate(' '.join(arr_neg_words))

ax2.imshow(wordcloud2)
ax2.axis('off')
ax2.set_title('Negative Tweets',fontsize=40);
In [80]:
from plotly_wordcloud import plotly_wordcloud

text = " ".join(arr_pos_words)

fig = plotly_wordcloud(text)
fig['layout']['title'] = 'Wordcloud for +Ve Tweets'
fig['layout']['height'] = 800
fig['layout']['width'] = 800
py.iplot(fig)
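Note: `plotly_wordcloud` is a small third-party helper rather than part of plotly itself; implementations of it typically run `WordCloud` to compute the word layout and re-render it as a plotly scatter trace, which is why the resulting figure can be tweaked like any plotly figure before `py.iplot`.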
In [84]:
import inspect
# inspect.getsourcelines(plotly_wordcloud)

Treemap
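Note: `df_pos_uni` and `df_neg_uni` used below are built further down, in cell In [39]; the treemap cells carry later execution counts (In [98] and In [95]), so both frames already exist by the time these cells run.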

In [98]:
fig = px.treemap(df_pos_uni.head(20),
           path=['Word'],values='Count',
           title='Top +Ve Twitter Words')

fig['layout']['title']['x'] = 0.5
fig.show()
In [95]:
fig = px.treemap(df_neg_uni.head(20),path=['Word'],values='Count')

fig.update_layout(
    title={
        'text': "Top -Ve Twitter Words",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})

fig.show()

New Features Comparison

In [21]:
df.columns
Out[21]:
Index(['index', 'id', 'label', 'tweet', 'tweet_lst_clean', 'tweet_clean',
       'hashtags_lst', 'hashtags', 'total_length', 'num_words', 'num_sent',
       'num_unique_words', 'num_words_title', 'num_uppercase',
       'num_exclamation_marks', 'num_question_marks', 'num_punctuation',
       'num_symbols', 'num_digits', 'avg_word_len', 'avg_uppercase',
       'avg_unique', 'tweet_lst_clean_emoji', 'tweet_clean_emoji'],
      dtype='object')
In [22]:
import warnings
warnings.simplefilter("ignore")
In [23]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
In [24]:
NEG_TWEETS = df_train[target] == 1

def compare_distplots(df_train,features):
    fig, axes = plt.subplots(ncols=2, nrows=len(features), figsize=(20, 50), dpi=100)

    for i, feature in enumerate(features):
        sns.distplot(df_train.loc[~NEG_TWEETS][feature], label='Positive', ax=axes[i][0], color='green')
        sns.distplot(df_train.loc[NEG_TWEETS][feature], label='Negative', ax=axes[i][0], color='red')

        sns.distplot(df_train[feature], label='Training', ax=axes[i][1])
        sns.distplot(df_test[feature], label='Test', ax=axes[i][1])

        for j in range(2):
            axes[i][j].set_xlabel('')
            axes[i][j].tick_params(axis='x', labelsize=20)
            axes[i][j].tick_params(axis='y', labelsize=20)
            axes[i][j].legend(fontsize=20)

        axes[i][0].set_title(f'{feature} Target Distribution in Training Set', fontsize=20)
        axes[i][1].set_title(f'{feature} Training & Test Set Distribution', fontsize=20)

    plt.show()
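Note: `sns.distplot` works with the seaborn 0.10.1 pinned above but is deprecated from seaborn 0.11 onward, where `sns.histplot(..., kde=True)` or `sns.kdeplot` are the replacements.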
In [25]:
features = ['total_length', 'num_words', 'num_sent',
    'num_unique_words', 'num_words_title', 'num_uppercase',
    'num_exclamation_marks' ]

compare_distplots(df_train,features)
In [26]:
features = ['num_question_marks', 
    'num_punctuation','num_symbols', 'num_digits',
    'avg_word_len', 'avg_uppercase']
compare_distplots(df_train,features)
In [27]:
note = """
NOTE:
The distriubtion must be different between label +ve and -ve
but must be similar between train and test sets.


""";

N-grams

In [28]:
from wordcloud import STOPWORDS
In [29]:
def generate_ngrams(text, n_gram=1):
    # Tokenize, drop empty tokens and stopwords, then slide an n-gram window.
    tokens = [t for t in text.lower().split(' ') if t != '' and t not in STOPWORDS]
    ngrams = zip(*[tokens[i:] for i in range(n_gram)])
    return [' '.join(ngram) for ngram in ngrams]
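A quick sanity check on a made-up sentence. Stopwords are removed before the window slides, so they never appear inside an n-gram:

In [ ]:
generate_ngrams('samsung beats the iphone', n_gram=1)
# -> ['samsung', 'beats', 'iphone']   ('the' is in STOPWORDS)
generate_ngrams('samsung beats the iphone', n_gram=2)
# -> ['samsung beats', 'beats iphone']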
In [30]:
def get_ngram_dfs(df_train,NEG_TWEETS,col,n_gram=1):

    # NEG_TWEETS = df_train[target] == 1
    neg_ngrams = defaultdict(int)
    pos_ngrams = defaultdict(int)

    for tweet in df_train[NEG_TWEETS][col]:
        for word in generate_ngrams(tweet,n_gram=n_gram):
            neg_ngrams[word] += 1

    for tweet in df_train[~NEG_TWEETS][col]:
        for word in generate_ngrams(tweet,n_gram=n_gram):
            pos_ngrams[word] += 1

    df_neg_ngrams = pd.DataFrame(sorted(neg_ngrams.items(),
                                          key=lambda x: x[1])[::-1])

    df_pos_ngrams = pd.DataFrame(sorted(pos_ngrams.items(),
                                          key=lambda x: x[1])[::-1])

    return [df_neg_ngrams,df_pos_ngrams]
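The same counts can be built more compactly with `collections.Counter`, whose `most_common()` already yields the descending order that the manual sort above produces (equivalent up to tie ordering; a sketch):

In [ ]:
from collections import Counter

def get_ngram_dfs_alt(df_train, NEG_TWEETS, col, n_gram=1):
    # Count n-grams per class; most_common() sorts by count, descending.
    neg = Counter(w for t in df_train[NEG_TWEETS][col]
                    for w in generate_ngrams(t, n_gram=n_gram))
    pos = Counter(w for t in df_train[~NEG_TWEETS][col]
                    for w in generate_ngrams(t, n_gram=n_gram))
    return [pd.DataFrame(neg.most_common()), pd.DataFrame(pos.most_common())]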
In [31]:
def plot_neg_pos_ngrams(n_gram_name,
            df_neg_ngrams,df_pos_ngrams,N=20):

    FS = 25
    fig, axes = plt.subplots(ncols=2, figsize=(18, 20), dpi=100)
    plt.tight_layout()

    sns.barplot(y=df_neg_ngrams[0].values[:N], x=df_neg_ngrams[1].values[:N], ax=axes[0], color='red')
    sns.barplot(y=df_pos_ngrams[0].values[:N], x=df_pos_ngrams[1].values[:N], ax=axes[1], color='green')

    for i in range(2):
        axes[i].spines['right'].set_visible(False)
        axes[i].set_xlabel('')
        axes[i].set_ylabel('')
        axes[i].tick_params(axis='x', labelsize=FS)
        axes[i].tick_params(axis='y', labelsize=FS)

    axes[0].set_title(f'Top {N} most common {n_gram_name} in -Ve Tweets', fontsize=FS)
    axes[1].set_title(f'Top {N} most common {n_gram_name} in +Ve Tweets', fontsize=FS)

    plt.show()

Uni-grams

In [32]:
df1, df2 = get_ngram_dfs(df_train,NEG_TWEETS,mce,n_gram=1)
plot_neg_pos_ngrams('Unigrams',df1,df2,N=20)

Bi-grams

In [33]:
df1, df2 = get_ngram_dfs(df_train,NEG_TWEETS,mce,n_gram=2)
plot_neg_pos_ngrams('Bigrams',df1,df2,N=20)

Tri-grams

In [34]:
df1, df2 = get_ngram_dfs(df_train,NEG_TWEETS,mce,n_gram=3)
plot_neg_pos_ngrams('Trigrams',df1,df2,N=20)

N-grams using plotly

In [49]:
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=False)
In [35]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
def get_top_n_words(corpus, n=None):
    """
    List the top n words in a vocabulary according
    to occurrence in a text corpus.
    """
    vec = CountVectorizer(stop_words = 'english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
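A toy example of what `get_top_n_words` returns (hypothetical corpus; ordering between equal counts may vary):

In [ ]:
toy = ['iphone iphone samsung', 'iphone case', 'the samsung case']
get_top_n_words(toy, n=3)
# -> [('iphone', 3), ('samsung', 2), ('case', 2)]  ('the' dropped by stop_words='english')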
In [36]:
df1.head()
Out[36]:
0 1
0 aple pic twiter 10
1 fuck aple aple 7
2 hate aple retwet 7
3 aple suck aple 7
4 aple aple iphone 6
In [37]:
df1.head(20)[::-1].plot.bar(x=1,y=0)
In [38]:
df_pos = df[df[target]==0.0]
df_neg = df[df[target]==1.0]

df_pos.head(2)
Out[38]:
[Wide DataFrame preview: first two rows of df_pos, same 24-column layout as the df_train preview above.]
In [39]:
pos_uni = get_top_n_words(df_pos[mce],20)
neg_uni = get_top_n_words(df_neg[mce],20)

df_pos_uni = pd.DataFrame(pos_uni,columns=['Word','Count'])[::-1]
df_neg_uni = pd.DataFrame(neg_uni,columns=['Word','Count'])[::-1]
In [45]:
fig = df_pos_uni.plot.bar(x='Count',y='Word')
fig.update_traces(marker_color='green', opacity=0.6)
fig.update_layout(title_text='Most frequent +Ve <b>Unigrams</b>')  # plotly titles take HTML, not markdown
fig.show()
In [47]:
fig = df_neg_uni.plot.bar(x='Count',y='Word')
fig.update_traces(marker_color='red', opacity=0.6)
fig.update_layout(title_text='Most frequent -Ve <b>Unigrams</b>')
fig.show()
In [ ]: